Practice Lesson 1: Basics of Corpus Analytics

Packages

## load required libraries
library(tidyverse)
library(quanteda)
library(lexicon)
library(reshape2)
library(stringi)
library(quanteda.textplots)
library(gridExtra)

Clean workspace and set working directory

## start from an empty global environment (attached packages stay loaded)
rm(list = ls())
## point the working directory (WD) at the lesson's source folder
path <- "~/coliphi21/practice_lessons/lesson_1/src/"
setwd(dir = path)
## print the current WD to verify the change
getwd()
## [1] "/Users/lucienbaumgartner/coliphi21"

Import data

For this tutorial you can either work with your own data or with the pre-built corpora provided in the /input folder for the first practice session. The quanteda package also contains pre-built corpora you can use. For this session, I will use the quanteda corpus data_corpus_inaugural, which contains the inaugural addresses of US presidents since 1789. If you work with your own data or our other pre-built corpora, this vignette might be helpful.

## work with the inaugural-address corpus that ships with quanteda
df <- quanteda::data_corpus_inaugural

Inspect data

## what does the corpus object look like?
df
## Corpus consisting of 59 documents and 4 docvars.
## 1789-Washington :
## "Fellow-Citizens of the Senate and of the House of Representa..."
## 
## 1793-Washington :
## "Fellow citizens, I am again called upon by the voice of my c..."
## 
## 1797-Adams :
## "When it was first perceived, in early times, that no middle ..."
## 
## 1801-Jefferson :
## "Friends and Fellow Citizens: Called upon to undertake the du..."
## 
## 1805-Jefferson :
## "Proceeding, fellow citizens, to that qualification which the..."
## 
## 1809-Madison :
## "Unwilling to depart from examples of the most revered author..."
## 
## [ reached max_ndoc ... 53 more documents ]
## summary statistics (first rows only)
head(summary(df))
## which class(es) does the object have?
class(df)
## [1] "corpus"    "character"
## how much memory does it occupy?
object.size(df)
## 838568 bytes
## what does the underlying data structure look like?
str(df)
##  'corpus' Named chr [1:59] "Fellow-Citizens of the Senate and of the House of Representatives:\n\nAmong the vicissitudes incident to life n"| __truncated__ "Fellow citizens, I am again called upon by the voice of my country to execute the functions of its Chief Magist"| __truncated__ "When it was first perceived, in early times, that no middle course for America remained between unlimited submi"| __truncated__ "Friends and Fellow Citizens:\n\nCalled upon to undertake the duties of the first executive office of our countr"| __truncated__ "Proceeding, fellow citizens, to that qualification which the Constitution requires before my entrance on the ch"| __truncated__ "Unwilling to depart from examples of the most revered authority, I avail myself of the occasion now presented t"| __truncated__ "About to add the solemnity of an oath to the obligations imposed by a second call to the station in which my co"| __truncated__ ...
##  - attr(*, "names")= chr [1:59] "1789-Washington" "1793-Washington" "1797-Adams" "1801-Jefferson" ...
##  - attr(*, "docvars")='data.frame':  59 obs. of  7 variables:
##   ..$ docname_ : chr [1:59] "1789-Washington" "1793-Washington" "1797-Adams" "1801-Jefferson" ...
##   ..$ docid_   : Factor w/ 59 levels "1789-Washington",..: 1 2 3 4 5 6 7 8 9 10 ...
##   ..$ segid_   : int [1:59] 1 1 1 1 1 1 1 1 1 1 ...
##   ..$ Year     : int [1:59] 1789 1793 1797 1801 1805 1809 1813 1817 1821 1825 ...
##   ..$ President: chr [1:59] "Washington" "Washington" "Adams" "Jefferson" ...
##   ..$ FirstName: chr [1:59] "George" "George" "John" "Thomas" ...
##   ..$ Party    : Factor w/ 6 levels "Democratic","Democratic-Republican",..: 4 4 3 2 2 2 2 2 2 2 ...
##  - attr(*, "meta")=List of 3
##   ..$ system:List of 5
##   .. ..$ package-version:Classes 'package_version', 'numeric_version'  hidden list of 1
##   .. .. ..$ : int [1:3] 2 1 2
##   .. ..$ r-version      :Classes 'R_system_version', 'package_version', 'numeric_version'  hidden list of 1
##   .. .. ..$ : int [1:3] 4 0 3
##   .. ..$ system         : Named chr [1:3] "Darwin" "x86_64" "kbenoit"
##   .. .. ..- attr(*, "names")= chr [1:3] "sysname" "machine" "user"
##   .. ..$ directory      : chr "/Users/kbenoit/Dropbox (Personal)/GitHub/quanteda/quanteda"
##   .. ..$ created        : Date[1:1], format: "2021-01-25"
##   ..$ object:List of 2
##   .. ..$ unit   : chr "documents"
##   .. ..$ summary:List of 2
##   .. .. ..$ hash: chr(0) 
##   .. .. ..$ data: NULL
##   ..$ user  :List of 6
##   .. ..$ description: chr "Transcripts of all inaugural addresses delivered by United States Presidents, from Washington 1789 onward.  Dat"| __truncated__
##   .. ..$ source     : chr "Gerhard Peters and John T. Woolley. The American Presidency Project."
##   .. ..$ url        : chr "https://www.presidency.ucsb.edu/documents/presidential-documents-archive-guidebook/inaugural-addresses"
##   .. ..$ author     : chr "(various US Presidents)"
##   .. ..$ keywords   : chr [1:5] "political" "US politics" "United States" "presidents" ...
##   .. ..$ title      : chr "US presidential inaugural address speeches"

Interacting with the data

Document variables

## first rows of the document-level variables (docvars)
head(docvars(df))

Selecting documents

## text data: how can we look at Biden's 2021 speech?
## as.character() turns the corpus into a character vector of texts;
## names(txt) lists the document names we can select by
txt <- as.character(df)
names(txt)
##  [1] "1789-Washington" "1793-Washington" "1797-Adams"      "1801-Jefferson"  "1805-Jefferson"  "1809-Madison"    "1813-Madison"    "1817-Monroe"     "1821-Monroe"     "1825-Adams"      "1829-Jackson"    "1833-Jackson"    "1837-VanBuren"   "1841-Harrison"   "1845-Polk"       "1849-Taylor"     "1853-Pierce"     "1857-Buchanan"   "1861-Lincoln"    "1865-Lincoln"    "1869-Grant"      "1873-Grant"      "1877-Hayes"      "1881-Garfield"   "1885-Cleveland"  "1889-Harrison"   "1893-Cleveland"  "1897-McKinley"   "1901-McKinley"   "1905-Roosevelt"  "1909-Taft"       "1913-Wilson"     "1917-Wilson"     "1921-Harding"    "1925-Coolidge"   "1929-Hoover"     "1933-Roosevelt"  "1937-Roosevelt"  "1941-Roosevelt"  "1945-Roosevelt"  "1949-Truman"     "1953-Eisenhower" "1957-Eisenhower" "1961-Kennedy"    "1965-Johnson"    "1969-Nixon"      "1973-Nixon"      "1977-Carter"     "1981-Reagan"     "1985-Reagan"     "1989-Bush"       "1993-Clinton"    "1997-Clinton"    "2001-Bush"       "2005-Bush"      
## [56] "2009-Obama"      "2013-Obama"      "2017-Trump"      "2021-Biden.txt"
## keep the speech(es) whose document name contains "Biden", then print the text
biden <- txt[grep("Biden", names(txt))]
cat(biden)
## Chief Justice Roberts, Vice President Harris, Speaker Pelosi, Leader Schumer, Leader McConnell, Vice President Pence, distinguished guests, and my fellow Americans.
## 
## This is America's day.
## 
## This is democracy's day.
## 
## A day of history and hope.
## 
## Of renewal and resolve.
## 
## Through a crucible for the ages America has been tested anew and America has risen to the challenge.
## 
## Today, we celebrate the triumph not of a candidate, but of a cause, the cause of democracy.
## 
## The will of the people has been heard and the will of the people has been heeded.
## 
## We have learned again that democracy is precious.
## 
## Democracy is fragile.
## 
## And at this hour, my friends, democracy has prevailed.
## 
## So now, on this hallowed ground where just days ago violence sought to shake this Capitol's very foundation, we come together as one nation, under God, indivisible, to carry out the peaceful transfer of power as we have for more than two centuries.
## 
## We look ahead in our uniquely American way – restless, bold, optimistic – and set our sights on the nation we know we can be and we must be.
## 
## I thank my predecessors of both parties for their presence here.
## 
## I thank them from the bottom of my heart.
## 
## You know the resilience of our Constitution and the strength of our nation.
## 
## As does President Carter, who I spoke to last night but who cannot be with us today, but whom we salute for his lifetime of service.
## 
## I have just taken the sacred oath each of these patriots took — an oath first sworn by George Washington.
## 
## But the American story depends not on any one of us, not on some of us, but on all of us.
## 
## On "We the People" who seek a more perfect Union.
## 
## This is a great nation and we are a good people.
## 
## Over the centuries through storm and strife, in peace and in war, we have come so far. But we still have far to go.
## 
## We will press forward with speed and urgency, for we have much to do in this winter of peril and possibility.
## 
## Much to repair.
## 
## Much to restore.
## 
## Much to heal.
## 
## Much to build.
## 
## And much to gain.
## 
## Few periods in our nation's history have been more challenging or difficult than the one we're in now.
## 
## A once-in-a-century virus silently stalks the country.
## 
## It's taken as many lives in one year as America lost in all of World War II.
## 
## Millions of jobs have been lost.
## 
## Hundreds of thousands of businesses closed.
## 
## A cry for racial justice some 400 years in the making moves us. The dream of justice for all will be deferred no longer.
## 
## A cry for survival comes from the planet itself. A cry that can't be any more desperate or any more clear.
## 
## And now, a rise in political extremism, white supremacy, domestic terrorism that we must confront and we will defeat.
## 
## To overcome these challenges – to restore the soul and to secure the future of America – requires more than words.
## 
## It requires that most elusive of things in a democracy:
## 
## Unity.
## 
## Unity.
## 
## In another January in Washington, on New Year's Day 1863, Abraham Lincoln signed the Emancipation Proclamation.
## 
## When he put pen to paper, the President said, "If my name ever goes down into history it will be for this act and my whole soul is in it."
## 
## My whole soul is in it.
## 
## Today, on this January day, my whole soul is in this:
## 
## Bringing America together.
## 
## Uniting our people.
## 
## And uniting our nation.
## 
## I ask every American to join me in this cause.
## 
## Uniting to fight the common foes we face:
## 
## Anger, resentment, hatred.
## 
## Extremism, lawlessness, violence.
## 
## Disease, joblessness, hopelessness.
## 
## With unity we can do great things. Important things.
## 
## We can right wrongs.
## 
## We can put people to work in good jobs.
## 
## We can teach our children in safe schools.
## 
## We can overcome this deadly virus.
## 
## We can reward work, rebuild the middle class, and make health care
## 
## secure for all.
## 
## We can deliver racial justice.
## 
## We can make America, once again, the leading force for good in the world.
## 
## I know speaking of unity can sound to some like a foolish fantasy.
## 
## I know the forces that divide us are deep and they are real.
## 
## But I also know they are not new.
## 
## Our history has been a constant struggle between the American ideal that we are all created equal and the harsh, ugly reality that racism, nativism, fear, and demonization have long torn us apart.
## 
## The battle is perennial.
## 
## Victory is never assured.
## 
## Through the Civil War, the Great Depression, World War, 9/11, through struggle, sacrifice, and setbacks, our "better angels" have always prevailed.
## 
## In each of these moments, enough of us came together to carry all of us forward.
## 
## And, we can do so now.
## 
## History, faith, and reason show the way, the way of unity.
## 
## We can see each other not as adversaries but as neighbors.
## 
## We can treat each other with dignity and respect.
## 
## We can join forces, stop the shouting, and lower the temperature.
## 
## For without unity, there is no peace, only bitterness and fury.
## 
## No progress, only exhausting outrage.
## 
## No nation, only a state of chaos.
## 
## This is our historic moment of crisis and challenge, and unity is the path forward.
## 
## And, we must meet this moment as the United States of America.
## 
## If we do that, I guarantee you, we will not fail.
## 
## We have never, ever, ever failed in America when we have acted together.
## 
## And so today, at this time and in this place, let us start afresh.
## 
## All of us.
## 
## Let us listen to one another.
## 
## Hear one another.
## 
## See one another.
## 
## Show respect to one another.
## 
## Politics need not be a raging fire destroying everything in its path.
## 
## Every disagreement doesn't have to be a cause for total war.
## 
## And, we must reject a culture in which facts themselves are manipulated and even manufactured.
## 
## My fellow Americans, we have to be different than this.
## 
## America has to be better than this.
## 
## And, I believe America is better than this.
## 
## Just look around.
## 
## Here we stand, in the shadow of a Capitol dome that was completed amid the Civil War, when the Union itself hung in the balance.
## 
## Yet we endured and we prevailed.
## 
## Here we stand looking out to the great Mall where Dr. King spoke of his dream.
## 
## Here we stand, where 108 years ago at another inaugural, thousands of protestors tried to block brave women from marching for the right to vote.
## 
## Today, we mark the swearing-in of the first woman in American history elected to national office – Vice President Kamala Harris.
## 
## Don't tell me things can't change.
## 
## Here we stand across the Potomac from Arlington National Cemetery, where heroes who gave the last full measure of devotion rest in eternal peace.
## 
## And here we stand, just days after a riotous mob thought they could use violence to silence the will of the people, to stop the work of our democracy, and to drive us from this sacred ground.
## 
## That did not happen.
## 
## It will never happen.
## 
## Not today.
## 
## Not tomorrow.
## 
## Not ever.
## 
## To all those who supported our campaign I am humbled by the faith you have placed in us.
## 
## To all those who did not support us, let me say this: Hear me out as we move forward. Take a measure of me and my heart.
## 
## And if you still disagree, so be it.
## 
## That's democracy. That's America. The right to dissent peaceably, within the guardrails of our Republic, is perhaps our nation's greatest strength.
## 
## Yet hear me clearly: Disagreement must not lead to disunion.
## 
## And I pledge this to you: I will be a President for all Americans.
## 
## I will fight as hard for those who did not support me as for those who did.
## 
## Many centuries ago, Saint Augustine, a saint of my church, wrote that a people was a multitude defined by the common objects of their love.
## 
## What are the common objects we love that define us as Americans?
## 
## I think I know.
## 
## Opportunity.
## 
## Security.
## 
## Liberty.
## 
## Dignity.
## 
## Respect.
## 
## Honor.
## 
## And, yes, the truth.
## 
## Recent weeks and months have taught us a painful lesson.
## 
## There is truth and there are lies.
## 
## Lies told for power and for profit.
## 
## And each of us has a duty and responsibility, as citizens, as Americans, and especially as leaders – leaders who have pledged to honor our Constitution and protect our nation — to defend the truth and to defeat the lies.
## 
## I understand that many Americans view the future with some fear and trepidation.
## 
## I understand they worry about their jobs, about taking care of their families, about what comes next.
## 
## I get it.
## 
## But the answer is not to turn inward, to retreat into competing factions, distrusting those who don't look like you do, or worship the way you do, or don't get their news from the same sources you do.
## 
## We must end this uncivil war that pits red against blue, rural versus urban, conservative versus liberal.
## 
## We can do this if we open our souls instead of hardening our hearts.
## 
## If we show a little tolerance and humility.
## 
## If we're willing to stand in the other person's shoes just for a moment.
## 
## Because here is the thing about life: There is no accounting for what fate will deal you.
## 
## There are some days when we need a hand.
## 
## There are other days when we're called on to lend one.
## 
## That is how we must be with one another.
## 
## And, if we are this way, our country will be stronger, more prosperous, more ready for the future.
## 
## My fellow Americans, in the work ahead of us, we will need each other.
## 
## We will need all our strength to persevere through this dark winter.
## 
## We are entering what may well be the toughest and deadliest period of the virus.
## 
## We must set aside the politics and finally face this pandemic as one nation.
## 
## I promise you this: as the Bible says weeping may endure for a night but joy cometh in the morning.
## 
## We will get through this, together
## 
## The world is watching today.
## 
## So here is my message to those beyond our borders: America has been tested and we have come out stronger for it.
## 
## We will repair our alliances and engage with the world once again.
## 
## Not to meet yesterday's challenges, but today's and tomorrow's.
## 
## We will lead not merely by the example of our power but by the power of our example.
## 
## We will be a strong and trusted partner for peace, progress, and security.
## 
## We have been through so much in this nation.
## 
## And, in my first act as President, I would like to ask you to join me in a moment of silent prayer to remember all those we lost this past year to the pandemic.
## 
## To those 400,000 fellow Americans – mothers and fathers, husbands and wives, sons and daughters, friends, neighbors, and co-workers.
## 
## We will honor them by becoming the people and nation we know we can and should be.
## 
## Let us say a silent prayer for those who lost their lives, for those they left behind, and for our country.
## 
## Amen.
## 
## This is a time of testing.
## 
## We face an attack on democracy and on truth.
## 
## A raging virus.
## 
## Growing inequity.
## 
## The sting of systemic racism.
## 
## A climate in crisis.
## 
## America's role in the world.
## 
## Any one of these would be enough to challenge us in profound ways.
## 
## But the fact is we face them all at once, presenting this nation with the gravest of responsibilities.
## 
## Now we must step up.
## 
## All of us.
## 
## It is a time for boldness, for there is so much to do.
## 
## And, this is certain.
## 
## We will be judged, you and I, for how we resolve the cascading crises of our era.
## 
## Will we rise to the occasion?
## 
## Will we master this rare and difficult hour?
## 
## Will we meet our obligations and pass along a new and better world for our children?
## 
## I believe we must and I believe we will.
## 
## And when we do, we will write the next chapter in the American story.
## 
## It's a story that might sound something like a song that means a lot to me.
## 
## It's called "American Anthem" and there is one verse stands out for me:
## 
## "The work and prayers
## 
## of centuries have brought us to this day
## 
## What shall be our legacy?
## 
## What will our children say?…
## 
## Let me know in my heart
## 
## When my days are through
## 
## America
## 
## America
## 
## I gave my best to you."
## 
## Let us add our own work and prayers to the unfolding story of our nation.
## 
## If we do this then when our days are through our children and our children's children will say of us they gave their best.
## 
## They did their duty.
## 
## They healed a broken land.
## 
## My fellow Americans, I close today where I began, with a sacred oath.
## 
## Before God and all of you I give you my word.
## 
## I will always level with you.
## 
## I will defend the Constitution.
## 
## I will defend our democracy.
## 
## I will defend America.
## 
## I will give my all in your service thinking not of power, but of possibilities.
## 
## Not of personal interest, but of the public good.
## 
## And together, we shall write an American story of hope, not fear.
## 
## Of unity, not division.
## 
## Of light, not darkness.
## 
## An American story of decency and dignity.
## 
## Of love and of healing.
## 
## Of greatness and of goodness.
## 
## May this be the story that guides us.
## 
## The story that inspires us.
## 
## The story that tells ages yet to come that we answered the call of history.
## 
## We met the moment.
## 
## That democracy and hope, truth and justice, did not die on our watch but thrived.
## 
## That our America secured liberty at home and stood once again as a beacon to the world.
## 
## That is what we owe our forebearers, one another, and generations to follow.
## 
## So, with purpose and resolve we turn to the tasks of our time.
## 
## Sustained by faith.
## 
## Driven by conviction.
## 
## And, devoted to one another and to this country we love with all our hearts.
## 
## May God bless America and may God protect our troops.
## 
## Thank you, America.
## print Washington's 1789 speech for comparison
cat(txt[["1789-Washington"]])
## Fellow-Citizens of the Senate and of the House of Representatives:
## 
## Among the vicissitudes incident to life no event could have filled me with greater anxieties than that of which the notification was transmitted by your order, and received on the 14th day of the present month. On the one hand, I was summoned by my Country, whose voice I can never hear but with veneration and love, from a retreat which I had chosen with the fondest predilection, and, in my flattering hopes, with an immutable decision, as the asylum of my declining years  -  a retreat which was rendered every day more necessary as well as more dear to me by the addition of habit to inclination, and of frequent interruptions in my health to the gradual waste committed on it by time. On the other hand, the magnitude and difficulty of the trust to which the voice of my country called me, being sufficient to awaken in the wisest and most experienced of her citizens a distrustful scrutiny into his qualifications, could not but overwhelm with despondence one who (inheriting inferior endowments from nature and unpracticed in the duties of civil administration) ought to be peculiarly conscious of his own deficiencies. In this conflict of emotions all I dare aver is that it has been my faithful study to collect my duty from a just appreciation of every circumstance by which it might be affected. All I dare hope is that if, in executing this task, I have been too much swayed by a grateful remembrance of former instances, or by an affectionate sensibility to this transcendent proof of the confidence of my fellow citizens, and have thence too little consulted my incapacity as well as disinclination for the weighty and untried cares before me, my error will be palliated by the motives which mislead me, and its consequences be judged by my country with some share of the partiality in which they originated.
## 
## Such being the impressions under which I have, in obedience to the public summons, repaired to the present station, it would be peculiarly improper to omit in this first official act my fervent supplications to that Almighty Being who rules over the universe, who presides in the councils of nations, and whose providential aids can supply every human defect, that His benediction may consecrate to the liberties and happiness of the people of the United States a Government instituted by themselves for these essential purposes, and may enable every instrument employed in its administration to execute with success the functions allotted to his charge. In tendering this homage to the Great Author of every public and private good, I assure myself that it expresses your sentiments not less than my own, nor those of my fellow citizens at large less than either. No people can be bound to acknowledge and adore the Invisible Hand which conducts the affairs of men more than those of the United States. Every step by which they have advanced to the character of an independent nation seems to have been distinguished by some token of providential agency; and in the important revolution just accomplished in the system of their united government the tranquil deliberations and voluntary consent of so many distinct communities from which the event has resulted can not be compared with the means by which most governments have been established without some return of pious gratitude, along with an humble anticipation of the future blessings which the past seem to presage. These reflections, arising out of the present crisis, have forced themselves too strongly on my mind to be suppressed. You will join with me, I trust, in thinking that there are none under the influence of which the proceedings of a new and free government can more auspiciously commence.
## 
## By the article establishing the executive department it is made the duty of the President "to recommend to your consideration such measures as he shall judge necessary and expedient." The circumstances under which I now meet you will acquit me from entering into that subject further than to refer to the great constitutional charter under which you are assembled, and which, in defining your powers, designates the objects to which your attention is to be given. It will be more consistent with those circumstances, and far more congenial with the feelings which actuate me, to substitute, in place of a recommendation of particular measures, the tribute that is due to the talents, the rectitude, and the patriotism which adorn the characters selected to devise and adopt them. In these honorable qualifications I behold the surest pledges that as on one side no local prejudices or attachments, no separate views nor party animosities, will misdirect the comprehensive and equal eye which ought to watch over this great assemblage of communities and interests, so, on another, that the foundation of our national policy will be laid in the pure and immutable principles of private morality, and the preeminence of free government be exemplified by all the attributes which can win the affections of its citizens and command the respect of the world. 
## I dwell on this prospect with every satisfaction which an ardent love for my country can inspire, since there is no truth more thoroughly established than that there exists in the economy and course of nature an indissoluble union between virtue and happiness; between duty and advantage; between the genuine maxims of an honest and magnanimous policy and the solid rewards of public prosperity and felicity; since we ought to be no less persuaded that the propitious smiles of Heaven can never be expected on a nation that disregards the eternal rules of order and right which Heaven itself has ordained; and since the preservation of the sacred fire of liberty and the destiny of the republican model of government are justly considered, perhaps, as deeply, as finally, staked on the experiment entrusted to the hands of the American people.
## 
## Besides the ordinary objects submitted to your care, it will remain with your judgment to decide how far an exercise of the occasional power delegated by the fifth article of the Constitution is rendered expedient at the present juncture by the nature of objections which have been urged against the system, or by the degree of inquietude which has given birth to them. Instead of undertaking particular recommendations on this subject, in which I could be guided by no lights derived from official opportunities, I shall again give way to my entire confidence in your discernment and pursuit of the public good; for I assure myself that whilst you carefully avoid every alteration which might endanger the benefits of an united and effective government, or which ought to await the future lessons of experience, a reverence for the characteristic rights of freemen and a regard for the public harmony will sufficiently influence your deliberations on the question how far the former can be impregnably fortified or the latter be safely and advantageously promoted.
## 
## To the foregoing observations I have one to add, which will be most properly addressed to the House of Representatives. It concerns myself, and will therefore be as brief as possible. When I was first honored with a call into the service of my country, then on the eve of an arduous struggle for its liberties, the light in which I contemplated my duty required that I should renounce every pecuniary compensation. From this resolution I have in no instance departed; and being still under the impressions which produced it, I must decline as inapplicable to myself any share in the personal emoluments which may be indispensably included in a permanent provision for the executive department, and must accordingly pray that the pecuniary estimates for the station in which I am placed may during my continuance in it be limited to such actual expenditures as the public good may be thought to require.
## 
## Having thus imparted to you my sentiments as they have been awakened by the occasion which brings us together, I shall take my present leave; but not without resorting once more to the benign Parent of the Human Race in humble supplication that, since He has been pleased to favor the American people with opportunities for deliberating in perfect tranquillity, and dispositions for deciding with unparalleled unanimity on a form of government for the security of their union and the advancement of their happiness, so His divine blessing may be equally conspicuous in the enlarged views, the temperate consultations, and the wise measures on which the success of this Government must depend.

Word tokens and the document-term matrix

## word tokenization
## open the help page for tokens() (interactive use)
?tokens
## split each document into word tokens, dropping punctuation and symbols;
## padding = FALSE removes dropped tokens without leaving empty placeholders.
## TRUE/FALSE are spelled out because T and F are ordinary variables that
## can be reassigned, whereas TRUE and FALSE are reserved words.
toks <- tokens(df, remove_punct = TRUE, remove_symbols = TRUE, padding = FALSE)
toks
## Tokens consisting of 59 documents and 4 docvars.
## 1789-Washington :
##  [1] "Fellow-Citizens" "of"              "the"             "Senate"          "and"             "of"              "the"             "House"           "of"              "Representatives" "Among"           "the"            
## [ ... and 1,418 more ]
## 
## 1793-Washington :
##  [1] "Fellow"   "citizens" "I"        "am"       "again"    "called"   "upon"     "by"       "the"      "voice"    "of"       "my"      
## [ ... and 123 more ]
## 
## 1797-Adams :
##  [1] "When"      "it"        "was"       "first"     "perceived" "in"        "early"     "times"     "that"      "no"        "middle"    "course"   
## [ ... and 2,306 more ]
## 
## 1801-Jefferson :
##  [1] "Friends"   "and"       "Fellow"    "Citizens"  "Called"    "upon"      "to"        "undertake" "the"       "duties"    "of"        "the"      
## [ ... and 1,714 more ]
## 
## 1805-Jefferson :
##  [1] "Proceeding"    "fellow"        "citizens"      "to"            "that"          "qualification" "which"         "the"           "Constitution"  "requires"      "before"        "my"           
## [ ... and 2,154 more ]
## 
## 1809-Madison :
##  [1] "Unwilling" "to"        "depart"    "from"      "examples"  "of"        "the"       "most"      "revered"   "authority" "I"         "avail"    
## [ ... and 1,163 more ]
## 
## [ reached max_ndoc ... 53 more documents ]
## document-term matrix
## build the document-feature matrix from the tokens and print it;
## the printed feature names are lower-cased (e.g. "fellow-citizens")
dfx <- dfm(toks)
dfx
## Document-feature matrix of: 59 documents, 9,422 features (91.89% sparse) and 4 docvars.
##                  features
## docs              fellow-citizens  of the senate and house representatives among vicissitudes incident
##   1789-Washington               1  71 116      1  48     2               2     1            1        1
##   1793-Washington               0  11  13      0   2     0               0     0            0        0
##   1797-Adams                    3 140 163      1 130     0               2     4            0        0
##   1801-Jefferson                2 104 130      0  81     0               0     1            0        0
##   1805-Jefferson                0 101 143      0  93     0               0     7            0        0
##   1809-Madison                  1  69 104      0  43     0               0     0            0        0
## [ reached max_ndoc ... 53 more documents, reached max_nfeat ... 9,412 more features ]

Top features

## the 10 most frequent features, computed separately for every document
topfeatures(
  x = dfx,
  n = 10,
  groups = docnames(dfx)
)
## $`1789-Washington`
##   the    of   and    to which    in     i    be    my    by 
##   116    71    48    48    36    31    23    23    22    20 
## 
## $`1793-Washington`
##   the    of     i    to    in shall   and    by    my    it 
##    13    11     6     5     3     3     2     2     2     2 
## 
## $`1797-Adams`
## the  of and  to   a  in  it  be  by  if 
## 163 140 130  72  51  47  34  31  30  25 
## 
## $`1801-Jefferson`
##   the    of   and    to which  that    in   our     i     a 
##   130   104    81    61    25    24    24    24    21    21 
## 
## $`1805-Jefferson`
##   the    of   and    to  that    in  with their  them  have 
##   143   101    93    83    37    35    28    28    27    24 
## 
## $`1809-Madison`
##   the    of    to   and    in     a    as which    by     i 
##   104    69    61    43    34    19    15    14    11    11 
## 
## $`1813-Madison`
##   the    of   and    to     a    on   our    in    it which 
##   100    65    44    42    25    22    22    21    18    16 
## 
## $`1817-Monroe`
## the  of  to and  in our   a  it  be  is 
## 275 164 126 122  79  65  61  57  50  41 
## 
## $`1821-Monroe`
##   the    of    to   and    in     a which    it    be   our 
##   360   197   146   141   136    76    66    64    64    60 
## 
## $`1825-Adams`
##  the   of  and   to   in   by have that  our been 
##  304  245  116  101   62   38   36   36   36   29 
## 
## $`1829-Jackson`
##   the    of    to   and    in  that   our     a    be their 
##    92    71    53    49    24    21    18    16    16    16 
## 
## $`1833-Jackson`
##   the    of   and    to    in   our    my     a which   all 
##   101    76    53    46    23    19    18    15    14    14 
## 
## $`1837-VanBuren`
##  the   of  and   to   in that  our    a   it    i 
##  252  198  150  139   76   60   60   59   42   39 
## 
## $`1841-Harrison`
##   the    of    to   and    in  that     a    it which    be 
##   829   604   318   231   173   132   132   111   107   106 
## 
## $`1845-Polk`
##  the   of  and   to  our   in   be    a   it that 
##  397  298  189  184  101   87   76   65   54   47 
## 
## $`1849-Taylor`
##   the    of    to   and    in     i    by    be shall   our 
##    99    62    61    52    20    18    17    16    15    15 
## 
## $`1853-Pierce`
##   the    of   and    to     a    in    be  that which    it 
##   230   169   130   107    62    60    57    46    41    34 
## 
## $`1857-Buchanan`
##  the   of   to  and   in    a this  our   it   is 
##  238  139  105   97   61   58   39   35   32   32 
## 
## $`1861-Lincoln`
##  the   of   to  and   in   be that   it    a   is 
##  256  146  134  105   77   76   59   59   56   49 
## 
## $`1865-Lincoln`
##   the    to   and    of    it  that   war   all which    in 
##    58    27    24    22    13    12    12    10     9     9 
## 
## $`1869-Grant`
##  the   to   of  and   in   be    i    a   it will 
##   83   57   47   27   27   25   19   19   16   16 
## 
## $`1873-Grant`
##  the   of  and   to   in    i   my    a that   be 
##  106   72   50   49   26   25   21   21   20   19 
## 
## $`1877-Hayes`
##  the   of  and   to   in    a that   be   by   as 
##  240  166  102   88   63   41   39   32   26   26 
## 
## $`1881-Garfield`
##  the   of  and   to   in   is that    a   it  our 
##  317  181  119   80   49   37   35   35   35   35 
## 
## $`1885-Cleveland`
##   the    of   and    to    in     a   our their    is    be 
##   174   117   103    57    31    30    26    22    19    18 
## 
## $`1889-Harrison`
##  the   of  and   to   in  our that    a  not   be 
##  360  240  192  133   80   76   66   65   46   45 
## 
## $`1893-Cleveland`
##   the    of   and    to   our    in    be which  that    by 
##   156   119   102    79    46    36    25    23    21    21 
## 
## $`1897-McKinley`
## the  of and  to  in  be our   a  it  is 
## 345 228 171 113  81  65  60  57  56  46 
## 
## $`1901-McKinley`
## the  of and  to  in  we  be  it our for 
## 200 110  97  65  42  28  27  26  25  23 
## 
## $`1905-Roosevelt`
##   the    of   and    we    to    in   our     a which  have 
##    65    45    38    32    28    23    22    20    16    15 
## 
## $`1909-Taft`
## the  of and  to  in   a  be  is  as  it 
## 486 314 220 218 140 109  79  62  58  56 
## 
## $`1913-Wilson`
##  the   of  and   to   we  our   in   it    a have 
##  109   87   78   49   40   30   29   29   27   25 
## 
## $`1917-Wilson`
##  the  and   of   we   to   in  our that have   be 
##   94   77   76   47   46   36   33   29   27   22 
## 
## $`1921-Harding`
## the  of and  to  we our  in for   a  is 
## 200 159 152 104  80  68  63  52  47  47 
## 
## $`1925-Coolidge`
##  the   of  and   to   we    a   in that   is  not 
##  261  207  146  135   88   77   71   65   65   61 
## 
## $`1929-Hoover`
##  the   of  and   to   in  our    a   is  for that 
##  288  250  122  100   83   75   49   48   44   39 
## 
## $`1933-Roosevelt`
##  the   of  and   to   in    a that  our   we   it 
##  130  109   58   50   44   38   32   29   26   25 
## 
## $`1937-Roosevelt`
##   of  the   to  and   we    a that  our   in have 
##  106  106   56   53   47   39   33   33   29   21 
## 
## $`1941-Roosevelt`
##  the   of  and   to   in   we    a   it   is that 
##  114   81   47   36   35   32   31   28   24   23 
## 
## $`1945-Roosevelt`
##  the   we   of  and   to that  our    a   in   it 
##   27   26   25   21   16   14   14   13   11    7 
## 
## $`1949-Truman`
##  the  and   of   to   we   in that    a  our  for 
##  141  100   96   81   59   56   37   36   32   30 
## 
## $`1953-Eisenhower`
##  the   of  and   to   we   in  our that this    a 
##  171  142  101   81   66   65   58   40   37   33 
## 
## $`1957-Eisenhower`
## the  of and  we  to  in our all   a  is 
## 114  96  64  51  44  43  38  26  25  20 
## 
## $`1961-Kennedy`
##  the   of   to  and   we    a   in  our that  not 
##   86   65   43   41   30   29   26   21   20   19 
## 
## $`1965-Johnson`
##  the  and   of   to   in   we    a  our that   is 
##   77   65   57   37   36   34   33   32   27   27 
## 
## $`1969-Nixon`
##  the   of   to   we   in  our that  and   as    a 
##  136   94   69   65   61   47   42   39   34   31 
## 
## $`1973-Nixon`
##  the   of   to   in  and   we    a that  for  our 
##   83   68   65   58   50   47   35   33   32   32 
## 
## $`1977-Carter`
##  the  and   to   we  our   of    a  for that   in 
##   53   48   44   43   35   33   29   24   23   22 
## 
## $`1981-Reagan`
##  the  and   of   to   we  our    a   in that will 
##  122   92   90   80   57   56   46   45   34   33 
## 
## $`1985-Reagan`
## the and  of  to  we   a our  in for  is 
## 130 110  95  73  68  59  55  46  35  33 
## 
## $`1989-Bush`
## the and   a  to  of  we  is our  in are 
## 121  98  73  63  61  60  49  44  38  36 
## 
## $`1993-Clinton`
##   the   and   our    we    to    of    in    is   for world 
##    89    66    57    52    49    46    31    28    20    18 
## 
## $`1997-Clinton`
##  the   of  and   to  our    a   we   in  new that 
##  133   96   94   64   63   59   42   35   29   27 
## 
## $`2001-Bush`
## and  of the our  we   a  to  in  is not 
##  82  58  53  50  47  46  45  31  31  27 
## 
## $`2005-Bush`
##  the   of  and   in  our   to   we   is that    a 
##  142  116  108   51   50   38   37   30   28   27 
## 
## $`2009-Obama`
##  the  and   of   to  our   we that    a   is   in 
##  135  111   82   70   67   62   49   47   36   25 
## 
## $`2013-Obama`
##  the  and  our   of   we   to that    a  for   is 
##  104   89   76   69   68   66   55   37   28   25 
## 
## $`2017-Trump`
##     and     the      of     our      we    will      to      is america       a 
##      77      71      48      47      46      40      36      21      18      15 
## 
## $`2021-Biden.txt`
##  the  and   we   of   to    a  our   in this    i 
##  101   96   88   77   65   46   43   42   39   33
## ugh, the lists above are dominated by function words: not very informative...
## so let's remove stopwords from the tokens before creating the
## document-feature matrix; first, have a look at the English stopword list
print(stopwords(language = "en"))
##   [1] "i"          "me"         "my"         "myself"     "we"         "our"        "ours"       "ourselves"  "you"        "your"       "yours"      "yourself"   "yourselves" "he"         "him"        "his"        "himself"    "she"        "her"        "hers"       "herself"    "it"         "its"        "itself"     "they"       "them"       "their"      "theirs"     "themselves" "what"       "which"      "who"        "whom"       "this"       "that"       "these"      "those"      "am"         "is"         "are"        "was"        "were"       "be"         "been"       "being"      "have"       "has"        "had"        "having"     "do"         "does"       "did"        "doing"      "would"      "should"     "could"      "ought"      "i'm"        "you're"     "he's"       "she's"      "it's"       "we're"      "they're"    "i've"       "you've"     "we've"      "they've"    "i'd"        "you'd"      "he'd"       "she'd"      "we'd"       "they'd"     "i'll"       "you'll"    
##  [77] "he'll"      "she'll"     "we'll"      "they'll"    "isn't"      "aren't"     "wasn't"     "weren't"    "hasn't"     "haven't"    "hadn't"     "doesn't"    "don't"      "didn't"     "won't"      "wouldn't"   "shan't"     "shouldn't"  "can't"      "cannot"     "couldn't"   "mustn't"    "let's"      "that's"     "who's"      "what's"     "here's"     "there's"    "when's"     "where's"    "why's"      "how's"      "a"          "an"         "the"        "and"        "but"        "if"         "or"         "because"    "as"         "until"      "while"      "of"         "at"         "by"         "for"        "with"       "about"      "against"    "between"    "into"       "through"    "during"     "before"     "after"      "above"      "below"      "to"         "from"       "up"         "down"       "in"         "out"        "on"         "off"        "over"       "under"      "again"      "further"    "then"       "once"       "here"       "there"      "when"       "where"     
## [153] "why"        "how"        "all"        "any"        "both"       "each"       "few"        "more"       "most"       "other"      "some"       "such"       "no"         "nor"        "not"        "only"       "own"        "same"       "so"         "than"       "too"        "very"       "will"
## drop the English stopwords from the tokens, then rebuild the dfm
sel_toks <- toks %>%
  tokens_select(pattern = stopwords("en"), selection = "remove")
dfx <- sel_toks %>% dfm()
## again: top 10 features for every document, now without stopwords
topfeatures(
  x = dfx,
  n = 10,
  groups = docnames(dfx)
)
## $`1789-Washington`
##        can      every government     public        may    present    country        one   citizens       duty 
##          9          9          8          6          6          5          5          4          4          4 
## 
## $`1793-Washington`
##          shall            now           oath        present        country          voice         called       citizens administration     confidence 
##              3              2              2              1              1              1              1              1              1              1 
## 
## $`1797-Adams`
##       people   government          may      nations      country          can       states       nation constitution      foreign 
##           20           16           13           11            9            9            9            9            8            8 
## 
## $`1801-Jefferson`
## government         us        may        let        one      shall  principle        man   citizens     fellow 
##         12         10          8          7          6          6          6          6          5          5 
## 
## $`1805-Jefferson`
##   public citizens      may   fellow    state    among    shall       us      can     time 
##       14       10       10        8        8        7        7        7        6        6 
## 
## $`1809-Madison`
##     public    nations        can    country       well     states     rights      peace confidence     united 
##          6          6          5          4          4          4          4          4          3          3 
## 
## $`1813-Madison`
##      war  country    every   united  british   states   nation  without   spirit citizens 
##       15        5        5        5        5        4        4        4        4        3 
## 
## $`1817-Monroe`
##     states government      great     people      every         us     united       just        may      union 
##         21         21         21         15         14         14         13         10         10         10 
## 
## $`1821-Monroe`
##      great     states     united        war        may       made   citizens      every government     people 
##         29         20         16         16         15         15         14         13         12         11 
## 
## $`1825-Adams`
##      union government       upon     rights    country     public      great      peace      first     nation 
##         20         17         16         10          9          9          9          9          8          8 
## 
## $`1829-Jackson`
##     public government      shall        can      power        may     people   national      whose     duties 
##          8          6          6          5          5          4          4          4          3          3 
## 
## $`1833-Jackson`
##   government       people        union       states       powers         upon      general          may       united preservation 
##           13            9            9            8            5            5            5            4            4            4 
## 
## $`1837-VanBuren`
##        every       people institutions   government      country         upon           us          may          can        never 
##           20           20           16           15           13           13           12           11            9            8 
## 
## $`1841-Harrison`
##        power       people   government constitution          may         upon          one          can    executive       states 
##           47           38           36           36           34           34           26           26           25           24 
## 
## $`1845-Polk`
##   government       states        union          one       people       powers constitution      country    interests         upon 
##           45           36           32           19           16           16           15           14           14           14 
## 
## $`1849-Taylor`
##        shall   government      country       duties          may    interests constitution           us     congress          day 
##           15            7            6            5            4            4            4            4            4            3 
## 
## $`1853-Pierce`
##       upon        can      power government      every        may      shall       must         us     states 
##         24         14         11         10          9          9          9          9          9          8 
## 
## $`1857-Buchanan`
##       states        shall constitution          may       people   government        great     question      country       public 
##           22           18           17           15           13           13           11           11            9            9 
## 
## $`1861-Lincoln`
##          can constitution       people        union       states   government        shall          now         upon          one 
##           28           24           20           20           19           18           17           15           15           14 
## 
## $`1865-Lincoln`
##   war shall   god years union   let   may right  must    us 
##    12     5     5     4     4     4     3     3     3     3 
## 
## $`1869-Grant`
## country     now   every  public     may without      us    laws    best  dollar 
##       8       8       6       5       5       5       5       5       5       5 
## 
## $`1873-Grant`
##    country     people       made       best      great     office        one        can government       good 
##          8          7          6          6          5          5          4          4          4          4 
## 
## $`1877-Hayes`
##    country government       upon     public     states  political     people      great      party   citizens 
##         20         15         15         11         11         10          9          9          8          7 
## 
## $`1881-Garfield`
##       people   government       states constitution          can         upon        great        union          law       nation 
##           21           20           15           15           13           13           11           11           10            9 
## 
## $`1885-Cleveland`
##       people   government       public        shall constitution    interests        every     citizens       policy         upon 
##           16           16           11           10            8            7            5            5            5            5 
## 
## $`1889-Harrison`
##       people         upon       states        shall       public         laws          may        great constitution   government 
##           29           21           20           18           17           17           12           12           11           10 
## 
## $`1893-Cleveland`
##     people government         us        can      every     public   american    support   national    service 
##         19         13         10          9          9          8          8          7          6          6 
## 
## $`1897-McKinley`
##       upon     people government       must   congress      great    country        can     public      every 
##         31         25         23         23         18         16         14         13         13         12 
## 
## $`1901-McKinley`
## government     people       upon        now     united     states  executive   congress         us      shall 
##         13         12         11         10          9          9          9          9          8          7 
## 
## $`1905-Roosevelt`
##       us     life   people     must    great   nation problems      men    power    cause 
##       12        6        6        6        5        5        5        4        4        4 
## 
## $`1909-Taft`
## government   business       must        can        may       upon     proper   congress       race        law 
##         26         22         19         18         18         16         15         14         13         13 
## 
## $`1913-Wilson`
##      great government       life      every        men       upon    justice     things     nation      shall 
##         14          9          8          8          8          8          8          7          6          6 
## 
## $`1917-Wilson`
##    upon   shall      us purpose  action    life   world   peace   stand     can 
##      13       9       8       8       7       6       6       6       6       5 
## 
## $`1921-Harding`
##        world         must      america          war        never civilization          can          new        order          may 
##           23           23           15           13           12           12           11           11           10           10 
## 
## $`1925-Coolidge`
##        can    country       must      great     people government      world      peace       much       upon 
##         26         17         17         16         15         14         13         13         12         12 
## 
## $`1929-Hoover`
## government        can       upon   progress     people      world       must      peace    justice     nation 
##         24         17         17         16         15         15         15         15         14         12 
## 
## $`1933-Roosevelt`
##        can   national       must        may     people      shall leadership     helped     nation      world 
##         11          9          9          8          7          7          7          7          6          6 
## 
## $`1937-Roosevelt`
## government     people        can     nation       good        men        see  democracy      power   progress 
##         15         11          9          9          8          8          8          8          7          7 
## 
## $`1941-Roosevelt`
##    nation      know    spirit democracy      life        us    people   america     years   freedom 
##        11        10         9         9         8         8         7         7         6         6 
## 
## $`1945-Roosevelt`
##   shall   peace learned     men   today     can     way    test    life  fellow 
##       7       6       5       4       4       3       3       3       2       2 
## 
## $`1949-Truman`
##  nations    world      can    peace   people  freedom     free   united     must security 
##       22       22       16       14       12       12       11       10        9        9 
## 
## $`1953-Eisenhower`
##    free   world   faith   peace   shall      us  people    must    upon freedom 
##      21      14      13      12      11      11      10      10      10      10 
## 
## $`1957-Eisenhower`
##     may nations   world   peace freedom  people    seek     can    must    upon 
##      15      14      14      11      11      10      10       9       9       6 
## 
## $`1961-Kennedy`
##      let       us      can    world    sides      new   pledge citizens  nations     free 
##       16       12        9        8        8        7        7        5        5        5 
## 
## $`1965-Johnson`
##     us change nation   must people  union    man  world    old  every 
##     12     12     11     10      9      9      9      7      7      6 
## 
## $`1969-Nixon`
##     us    can people  world  peace    let   know    now   make  earth 
##     20     17     14     13     12     11     10      9      9      9 
## 
## $`1973-Nixon`
##             us            let          peace          world            new            can        america responsibility     government          great 
##             26             22             19             16             15             14             13             11             10              9 
## 
## $`1977-Carter`
##      can   nation      new     must       us   people together strength   spirit    human 
##       13       10        9        8        8        7        7        7        6        5 
## 
## $`1981-Reagan`
##         us government       must    believe     people  americans        one       time      world    freedom 
##         25         16         10         10          9          9          8          8          8          8 
## 
## $`1985-Reagan`
##         us     people      world        one government    freedom       must       time        now      human 
##         27         16         15         14         13         13         12         10         10          9 
## 
## $`1989-Bush`
##    new     us    can  great nation  world   free   must   hand   good 
##     14     13     11     10     10     10      9      9      8      8 
## 
## $`1993-Clinton`
##     world      must   america        us    people     today       new       let    change americans 
##        18        18        15        13        12        10         9         9         9         9 
## 
## $`1997-Clinton`
##     new      us century  nation    time   every  people america    land     one 
##      29      27      20      13      12      11      11      11      11      10 
## 
## $`2001-Bush`
##       us  country citizens    story   nation  america      can    every     must    never 
##       11        9        9        9        8        8        6        6        6        5 
## 
## $`2005-Bush`
##   freedom   liberty   america     every       one    nation   country     world americans america's 
##        25        15        12        10         9         9         8         8         8         8 
## 
## $`2009-Obama`
##      us     can  nation     new   every    must america  people    less     let 
##      23      13      12      11       8       8       8       7       7       7 
## 
## $`2013-Obama`
##       us     must   people     time      can    every together     make      one  country 
##       21       17       11       10        7        7        7        7        6        6 
## 
## $`2017-Trump`
##  america american   people  country      one    every    never    great   nation      new 
##       18       11       10        9        8        7        6        6        6        6 
## 
## $`2021-Biden.txt`
##        us   america       can       one    nation      must democracy    people   another  american 
##        27        18        16        15        12        10        10         9         9         9
## top features can also be grouped by any document variable (docvar)
# list the docvars attached to the dfm
print(docvars(dfx))
# top 10 features per value of the docvar Party
topfeatures(
  x = dfx,
  n = 10,
  groups = Party
)
## $Democratic
##         us     people        can government       must     nation      world        new      shall      every 
##        222        199        173        143        138        126        118        113        111        109 
## 
## $`Democratic-Republican`
## government      great     states        war        may     public      every         us      union    country 
##         68         61         56         51         49         48         45         44         42         40 
## 
## $Federalist
##       people   government          may      nations      country          can       states       nation constitution      foreign 
##           20           16           13           11            9            9            9            9            8            8 
## 
## $none
##        can      every government        may    present    country     public      shall   citizens     people 
##          9          9          9          7          6          6          6          6          5          5 
## 
## $Republican
##     people government        can         us       must       upon      world      great    country      peace 
##        264        240        228        218        201        192        180        159        147        139 
## 
## $Whig
##   government       states       people        power constitution          may         upon        union          one      country 
##           88           61           57           57           55           51           50           47           45           42

Level of analysis: sentence

Sometimes we want to analyze certain indicators at the sentence level. To show how to go about doing so, we will compute the per-sentence sentiment in Biden’s 2021 speech.

Reshape, Subset and Prepare Documents

## first step: keep only the documents whose docvar President equals 'Biden'
biden <- df %>% corpus_subset(President == 'Biden')
## 2nd step: reshape the corpus so that every sentence is its own document
sentences <- biden %>% corpus_reshape(to = 'sentences')
## print a preview of the sentence-level corpus
print(sentences)
## Corpus consisting of 216 documents and 4 docvars.
## 2021-Biden.txt.1 :
## "Chief Justice Roberts, Vice President Harris, Speaker Pelosi..."
## 
## 2021-Biden.txt.2 :
## "This is America's day."
## 
## 2021-Biden.txt.3 :
## "This is democracy's day."
## 
## 2021-Biden.txt.4 :
## "A day of history and hope."
## 
## 2021-Biden.txt.5 :
## "Of renewal and resolve."
## 
## 2021-Biden.txt.6 :
## "Through a crucible for the ages America has been tested anew..."
## 
## [ reached max_ndoc ... 210 more documents ]
## 3rd step: within-sentence word tokenization
# tokenize every sentence into words and drop punctuation
# (TRUE is spelled out: T is an ordinary variable that can be reassigned)
sentence_toks <- tokens(sentences, what = 'word', remove_punct = TRUE)
# make lower case
sentence_toks <- tokens_tolower(sentence_toks)
# remove English stopwords
# note: the stopword list printed above contains the negations 'no', 'nor' and 'not'
sentence_toks <- tokens_select(sentence_toks, pattern = stopwords("en"), selection = "remove")

Sentiment annotation

## choose a sentiment dictionary:
## we use the Lexicoder Sentiment Dictionary 2015 (LSD2015) that ships with quanteda
print(data_dictionary_LSD2015)
## Dictionary object with 4 key entries.
## - [negative]:
##   - a lie, abandon*, abas*, abattoir*, abdicat*, aberra*, abhor*, abject*, abnormal*, abolish*, abominab*, abominat*, abrasiv*, absent*, abstrus*, absurd*, abus*, accident*, accost*, accursed* [ ... and 2,838 more ]
## - [positive]:
##   - ability*, abound*, absolv*, absorbent*, absorption*, abundanc*, abundant*, acced*, accentuat*, accept*, accessib*, acclaim*, acclamation*, accolad*, accommodat*, accomplish*, accord, accordan*, accorded*, accords [ ... and 1,689 more ]
## - [neg_positive]:
##   - best not, better not, no damag*, no no, not ability*, not able, not abound*, not absolv*, not absorbent*, not absorption*, not abundanc*, not abundant*, not acced*, not accentuat*, not accept*, not accessib*, not acclaim*, not acclamation*, not accolad*, not accommodat* [ ... and 1,701 more ]
## - [neg_negative]:
##   - not a lie, not abandon*, not abas*, not abattoir*, not abdicat*, not aberra*, not abhor*, not abject*, not abnormal*, not abolish*, not abominab*, not abominat*, not abrasiv*, not absent*, not abstrus*, not absurd*, not abus*, not accident*, not accost*, not accursed* [ ... and 2,840 more ]
## apply the dictionary to Biden's speech
## only the first two keys (negative, positive) are used for the lookup
toks_lsd <- sentence_toks %>%
  tokens_lookup(dictionary = data_dictionary_LSD2015[1:2])
dfm_lsd <- toks_lsd %>% dfm()
## compute, for every sentence, the share of each sentiment among all
## dictionary hits, over the course of the speech
# convert the dfm to a data frame (one row per sentence) and print it
df_lsd <- dfm_lsd %>% convert(to = "data.frame")
print(df_lsd)
# melt to a long table: one row per sentence-sentiment pair, count in column n
df_lsd <- df_lsd %>%
  melt(id.vars = 'doc_id', variable.name = 'sentiment', value.name = 'n')
df_lsd %>% head()
# group by sentence (doc_id) and divide each count by the sentence total;
# a sentence without any dictionary hit yields 0/0 = NaN
df_lsd <- df_lsd %>%
  group_by(doc_id) %>%
  mutate(perc = n / sum(n))
df_lsd %>% head()
# give every sentence a numeric index: the digits that follow a dot in doc_id
df_lsd <- df_lsd %>%
  ungroup() %>%
  mutate(num_id = as.numeric(stri_extract(doc_id, regex = '(?<=\\.)[0-9]+')))

Visualization

## plot the per-sentence sentiment shares as smoothed curves, one per sentiment
df_lsd %>%
  ggplot(aes(x = num_id, y = perc, colour = sentiment, group = sentiment)) +
  geom_smooth() +
  #geom_point(alpha = 0.5) +
  scale_x_continuous(expand = c(0, 0)) +
  scale_y_continuous(expand = c(0.01, 0.01), labels = scales::percent) +
  labs(
    x = 'Sentence Number Within Speech',
    y = 'Frequency',
    title = "Biden's 2021 speech: sentiment per sentence, smoothed"
  ) +
  theme_classic() +
  theme(plot.title = element_text(face = 'bold'))

## inspect the token vectors of sentences 55 to 65
print(sentence_toks[55:65])
## Tokens consisting of 11 documents and 4 docvars.
## 2021-Biden.txt.1 :
## [1] "can"    "right"  "wrongs"
## 
## 2021-Biden.txt.2 :
## [1] "can"    "put"    "people" "work"   "good"   "jobs"  
## 
## 2021-Biden.txt.3 :
## [1] "can"      "teach"    "children" "safe"     "schools" 
## 
## 2021-Biden.txt.4 :
## [1] "can"      "overcome" "deadly"   "virus"   
## 
## 2021-Biden.txt.5 :
##  [1] "can"     "reward"  "work"    "rebuild" "middle"  "class"   "make"    "health"  "care"    "secure" 
## 
## 2021-Biden.txt.6 :
## [1] "can"     "deliver" "racial"  "justice"
## 
## [ reached max_ndoc ... 5 more documents ]
## uh.. these tokens do not look like they are meant negatively.
## let's double check by reading the untokenized sentences 55 to 65
print(sentences[55:65])
## Corpus consisting of 11 documents and 4 docvars.
## 2021-Biden.txt.1 :
## "We can right wrongs."
## 
## 2021-Biden.txt.2 :
## "We can put people to work in good jobs."
## 
## 2021-Biden.txt.3 :
## "We can teach our children in safe schools."
## 
## 2021-Biden.txt.4 :
## "We can overcome this deadly virus."
## 
## 2021-Biden.txt.5 :
## "We can reward work, rebuild the middle class, and make healt..."
## 
## 2021-Biden.txt.6 :
## "We can deliver racial justice."
## 
## [ reached max_ndoc ... 5 more documents ]
## as expected, Biden is mentioning ISSUES, but in a combative way

Level of analysis: token windows

Let’s compare the word embeddings for ‘progress’, ‘spirit’, ‘world’, ‘nation’, ‘duty’, and ‘war’ between Democrats and Republicans. We define the embedding as a window of +/-10 words around these keywords.

Prep

## tokenization: drop punctuation and symbols, leave no padding behind
toks <- tokens(df, remove_punct = TRUE, remove_symbols = TRUE, padding = FALSE)
## normalise the abbreviation 'US' to 'USA' BEFORE lowercasing, so that it is
## not conflated with (and later removed as) the pronoun 'us'.
## tokens_replace() matches whole tokens, by default with case-insensitive
## glob patterns: a regex like '\\bUS\\b' never matches, and a bare 'US' would
## also hit 'us'. Hence the exact, case-sensitive match on the literal token.
toks <- tokens_replace(toks,
                       pattern = 'US',
                       replacement = 'USA',
                       valuetype = 'fixed',
                       case_insensitive = FALSE)
toks <- tokens_tolower(toks)
## lemmatizing
# the lemma table holds literal word forms, so match them exactly
toks <- tokens_replace(toks,
                       pattern = lexicon::hash_lemmas$token,
                       replacement = lexicon::hash_lemmas$lemma,
                       valuetype = 'fixed')
## remove stopwords
# custom stopwords (used in addition to the standard English list)
cstmwrds <- c('upon', 'can', 'us', 'let', 'may', 'make',
              'must', 'many', 'shall', 'without', 'among',
              'much', 'every', 'ever', 'know', 'new', 'never',
              'year', 'find', 'see', 'good')
# remove them all
toks <- tokens_select(toks,
                      pattern = c(stopwords("en"), cstmwrds),
                      selection = "remove")
## define the keywords
query <- c('progress', 'spirit', 'world', 'nation', 'duty', 'war')

Feature co-occurrence matrix for ‘duty’

## Democratic speeches: keep 'duty' together with the 10 tokens on either side
## of it, then build the feature co-occurrence matrix (fcm) within that window.
## count = 'weighted': the further apart two words are, the less weight their
## co-occurrence gets
demo <- tokens_subset(toks, Party == 'Democratic')
toks_demo <- tokens_select(
  demo,
  pattern = 'duty',
  selection = "keep",
  window = 10,
  padding = FALSE,
  verbose = TRUE
)
dfcmat_demo <- fcm(
  toks_demo,
  context = 'window',
  window = 10,
  count = 'weighted',
  tri = FALSE
)
## Republican speeches: same procedure
repub <- tokens_subset(toks, Party == 'Republican')
toks_repub <- tokens_select(
  repub,
  pattern = 'duty',
  selection = "keep",
  window = 10,
  padding = FALSE,
  verbose = TRUE
)
dfcmat_repub <- fcm(
  toks_repub,
  context = 'window',
  window = 10,
  count = 'weighted',
  tri = FALSE
)
dfcmat_demo # have a look at one of the fcms
## Feature co-occurrence matrix of: 641 by 641 features.
##          features
## features  life     event day month       one      hand country voice health time
##   life       0 0           0     0 0         0               0     0      0    0
##   event      0 0           0     0 0.1428571 0               0     0      0    0
##   day        0 0           0     0 0         0               0     0      0    0
##   month      0 0           0     0 0         0               0     0      0    0
##   one        0 0.1428571   0     0 0         0.1666667       0     0      0    0
##   hand       0 0           0     0 0.1666667 0               0     0      0    0
##   country    0 0           0     0 0         0               0     0      0    0
##   voice      0 0           0     0 0         0               0     0      0    0
##   health     0 0           0     0 0         0               0     0      1    0
##   time       0 0           0     0 0         0               0     0      0    0
## [ reached max_feat ... 631 more features, reached max_nfeat ... 631 more features ]
## reduce each fcm to its 51 top features
## (presumably the keyword itself plus its 50 strongest co-occurring terms)
top_terms_demo <- names(topfeatures(dfcmat_demo, 51))
top_terms_repub <- names(topfeatures(dfcmat_repub, 51))
dfcmat_demo <- fcm_select(dfcmat_demo,
                          pattern = top_terms_demo,
                          selection = "keep")
dfcmat_repub <- fcm_select(dfcmat_repub,
                           pattern = top_terms_repub,
                           selection = "keep")

Visualization

## relative weight of each term: its row sum in the fcm divided by the
## smallest row sum; this drives both the vertex size and the label size
rel_weight_demo <- rowSums(dfcmat_demo) / min(rowSums(dfcmat_demo))
## compute varying word-label sizes for each term based on its frequency
label_sizes_demo <- rel_weight_demo * 1.2
## override the size for the keyword 'duty' (cannibalizes the whole space of the plot);
## only if it is present: assigning to a missing name would append an element
## and the sizes would no longer line up with the features
if ('duty' %in% names(label_sizes_demo)) {
  label_sizes_demo['duty'] <- 0.1
}
set.seed(123) # set seed for reproducibility
p_demo <- quanteda.textplots::textplot_network(dfcmat_demo,
                                              min_freq = 0.5,
                                              edge_alpha = 0.2,
                                              vertex_size = rel_weight_demo / 8,
                                              vertex_labelsize = label_sizes_demo,
                                              edge_color = 'dodgerblue') +
      # additional theme tweaks
      labs(title = 'Democratic: duty') +
      theme(plot.title = element_text(face = 'bold'))
## same for republicans
rel_weight_repub <- rowSums(dfcmat_repub) / min(rowSums(dfcmat_repub))
label_sizes_repub <- rel_weight_repub * 1.2
if ('duty' %in% names(label_sizes_repub)) {
  label_sizes_repub['duty'] <- 0.1
}
## re-seed before every plot, so that each plot is reproducible on its own
## and does not depend on the random numbers consumed by the previous plot
set.seed(123)
p_repub <- quanteda.textplots::textplot_network(dfcmat_repub,
                                              min_freq = 0.5,
                                              edge_alpha = 0.2,
                                              vertex_size = rel_weight_repub / 8,
                                              vertex_labelsize = label_sizes_repub,
                                              edge_color = 'firebrick') +
      # additional theme tweaks
      labs(title = 'Republican: duty') +
      theme(plot.title = element_text(face = 'bold'))
## plot both networks side by side
grid.arrange(p_demo, p_repub, ncol = 2)

Serialize with a loop

## feature co-occurrence matrix and network plot per party and keyword
container <- list() # plot-container, filled by name ('<party>: <keyword>')
for (m in c('Democratic', 'Republican')) { # loop over party
  ## subset data by the party-affiliation in <m>
  ## (once per party: the subset does not depend on the keyword)
  toks_party <- tokens_subset(toks, Party == m)
  ## edge colour by party; <m> is a single string, so use if/else, not ifelse()
  edge_col <- if (m == 'Republican') 'firebrick' else 'dodgerblue'
  for (i in query) { # loop over keywords
    ## select tokens within +/- 10 words around the keyword in <i>
    toks_sel <- tokens_select(toks_party, pattern = i, selection = "keep",
                              window = 10, padding = FALSE, verbose = TRUE)
    ## create feature co-occurrence matrix (fcm) with weights within the window
    ## the more distance between the words, the less weight the co-occurrence gets
    dfcmat <- fcm(toks_sel, context = 'window', window = 10,
                  count = 'weighted', tri = FALSE)
    ## extract the names of the 51 top features of the fcm
    ## (presumably the term in <i> plus its 50 top co-occurrences)
    feat <- names(topfeatures(dfcmat, 51))
    ## subset the fcm, by selecting those top terms
    dfcmat_sel <- fcm_select(dfcmat, pattern = feat, selection = "keep")
    ## create plot
    # relative weight of each term: row sum divided by the smallest row sum
    rel_weight <- rowSums(dfcmat_sel) / min(rowSums(dfcmat_sel))
    # compute varying word-label sizes for each term based on its frequency
    label_sizes <- rel_weight * 0.8
    # override the size for the term in <i> (cannibalizes the whole space of the plot);
    # only if it is present: assigning to a missing name would append an element
    if (i %in% names(label_sizes)) {
      label_sizes[i] <- 0.1
    }
    set.seed(123) # set seed for reproducibility
    p <- quanteda.textplots::textplot_network(dfcmat_sel,
                                              min_freq = 0.5,
                                              edge_alpha = 0.2,
                                              vertex_size = rel_weight / 8,
                                              vertex_labelsize = label_sizes,
                                              edge_color = edge_col) +
      # additional theme tweaks
      labs(title = paste0(m, ': ', i)) +
      theme(plot.title = element_text(face = 'bold'))
    ## populate the container
    container[[paste0(m, ': ', i)]] <- p
  }
}
## plot panel
## list the plot names to see the order in which the plots are stored
names(container)
##  [1] "Democratic: progress" "Democratic: spirit"   "Democratic: world"    "Democratic: nation"   "Democratic: duty"     "Democratic: war"      "Republican: progress" "Republican: spirit"   "Republican: world"    "Republican: nation"   "Republican: duty"     "Republican: war"
## arrange the plots in two columns: one row per keyword, with the Democratic
## plot on the left and the Republican plot on the right.
## the order is derived from the plot names instead of hard-coded positions,
## so the panel stays correct when the keywords in <query> change
panel_order <- as.vector(rbind(paste0('Democratic: ', query),
                               paste0('Republican: ', query)))
grid.arrange(grobs = unname(container[panel_order]), ncol = 2)

 




A work by Lucien Baumgartner & Kevin Reuter

https://lucienbaumgartner.github.io/